import re

from chardet.universaldetector import UniversalDetector

import setting

'''by: shenhuawade
qq: 317909531
Purpose: read and parse a doc document and extract its text content

readtextMain(filename)
------------------------------------------
Input parameter filename: file name, e.g. '游记：皮尔森啤酒博物馆.txt'
-------------------------------------------
Output parameter list: a list made up of each line of text

------------------------------------------
code/time: 2019.1.5
'''

import docx
from docx import Document


# Core function
def dangerWordFind(word, danger_words=None):
    """Decide whether a text fragment is clean enough to keep.

    Args:
        word: The text fragment (string) to check.
        danger_words: Optional iterable of forbidden substrings. When it is
            omitted (``None``), ``setting.DangerWord`` is used.

    Returns:
        False when ``word`` has 6 or fewer characters or contains any of the
        forbidden substrings; True otherwise.
    """
    # Preliminary cleaning: drop fragments of 6 characters or fewer.  The
    # check is done before looking at the word list so that it still applies
    # when the list of forbidden substrings is empty.
    if len(word) <= 6:
        return False

    if danger_words is None:
        danger_words = setting.DangerWord

    return not any(danger in word for danger in danger_words)


def getDocxWorList(filename):
    """Extract the usable text fragments from a .docx file.

    The text of every paragraph is split on the Chinese full stop ('。').
    Fragments accepted by ``dangerWordFind`` are kept, each one truncated to
    its first 36 characters.

    Args:
        filename: The .docx file to read; it is passed to ``Document``.

    Returns:
        A list of the accepted (truncated) fragments, in document order.
    """
    # The caller's ``filename`` is used as given; it must not be replaced by
    # a fixed sample path, otherwise every call reads the same document.
    document = Document(filename)

    # Gather the '。'-separated fragments of every paragraph.
    fragments = []
    for para in document.paragraphs:
        fragments.extend(para.text.split('。'))

    # Keep only the fragments that pass the filter, capped at 36 characters.
    return [fragment[:36] for fragment in fragments if dangerWordFind(fragment)]




if __name__ == "__main__":
    # Demo run against a fixed sample document: print every '。'-separated
    # fragment that passes dangerWordFind (fragments are not truncated here).
    docx_path = r'[电影岛赏] - 2021-01-12 这才是我2021开年最想看到的电影.docx'
    doc = Document(docx_path)

    # Gather the fragments of every paragraph.
    sentences = []
    for paragraph in doc.paragraphs:
        sentences += re.split(r'。', paragraph.text)

    kept = [sentence for sentence in sentences if dangerWordFind(sentence)]

    print(kept)
